This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
rm(list = ls(all=TRUE)) # clear global environment
library(tidyr) # data loading
library(dplyr) # dataframe manipulation
library(stringr) # string manipulation
library(ggplot2) # plotting
path <- file.path("C:","Users","Paulina-laptop","Desktop","R_Programming","RLadies_workshop")
setwd(path)
sprintf("Current directory: %s", getwd())
## [1] "Current directory: C:/Users/Paulina-laptop/Desktop/R_Programming/RLadies_workshop"
Let’s load some data first
wells <- read.csv(unzip("BCOGC_data/prod_csv.zip", "zone_prd_2016_to_present.csv"), skip = 1)
head(wells)
## Wa_num Compltn_event_seq Prod_period UWI Area_code Formtn_code
## 1 29 0 201601 100142108318W600 3600 6200
## 2 29 0 201603 100142108318W600 3600 6200
## 3 29 0 201611 100142108318W600 3600 6200
## 4 29 0 201612 100142108318W600 3600 6200
## 5 29 0 201701 100142108318W600 3600 6200
## 6 29 0 201702 100142108318W600 3600 6200
## Pool_seq Gas_prod_vol..e3m3. Oil_prod_vol..m3. Water_prod_vol..m3.
## 1 A 107.7 0 2.3
## 2 A 45.2 0 1.7
## 3 A 166.2 0 5.0
## 4 A 159.2 0 2.6
## 5 A 133.5 0 3.7
## 6 A 95.8 0 4.1
## Cond_prod_vol..m3. Prod_days. Gas_prod_cum..e3m3. Oil_prod_cum..m3.
## 1 0.5 31.0 107.7 0
## 2 0.3 13.0 152.9 0
## 3 1.3 22.0 319.1 0
## 4 0.5 31.0 478.3 0
## 5 0.6 31.0 611.8 0
## 6 0.5 27.8 707.6 0
## Water_prod_cum..m3. Cond_prod_cum..m3. Project_code
## 1 2.3 0.5 0
## 2 4.0 0.8 0
## 3 9.0 2.1 0
## 4 11.6 2.6 0
## 5 15.3 3.2 0
## 6 19.4 3.7 0
Column names in the original file have inconvenient interpunction, which can be easily fixed:
names(wells)
## [1] "Wa_num" "Compltn_event_seq" "Prod_period"
## [4] "UWI" "Area_code" "Formtn_code"
## [7] "Pool_seq" "Gas_prod_vol..e3m3." "Oil_prod_vol..m3."
## [10] "Water_prod_vol..m3." "Cond_prod_vol..m3." "Prod_days."
## [13] "Gas_prod_cum..e3m3." "Oil_prod_cum..m3." "Water_prod_cum..m3."
## [16] "Cond_prod_cum..m3." "Project_code"
names(wells) <- str_replace_all(names(wells), c(" " = "" ,
"," = "",
"\\("="_",
"\\)"="",
"\\.."="_",
"\\."=""
))
names(wells)
## [1] "Wa_num" "Compltn_event_seq" "Prod_period"
## [4] "UWI" "Area_code" "Formtn_code"
## [7] "Pool_seq" "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"
## [10] "Water_prod_vol_m3" "Cond_prod_vol_m3" "Prod_days"
## [13] "Gas_prod_cum_e3m3" "Oil_prod_cum_m3" "Water_prod_cum_m3"
## [16] "Cond_prod_cum_m3" "Project_code"
wells <- subset(wells, select=c(Wa_num, Prod_period, Prod_days, Area_code, Formtn_code, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Water_prod_vol_m3, Cond_prod_vol_m3)) # select only useful columns
area_codes <- read.table("BCOGC_data/ogc_area_codes.csv",
sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(area_codes, 3)
## Area.Code Area.Name
## 1 50 ADSETT
## 2 100 AIRPORT
## 3 200 AITKEN CREEK
# rename so that we have the same name of Area_code and Area_name in all columns
area_codes <- rename(area_codes, Area_code = Area.Code, Area_name = Area.Name)
formation_codes <- read.table("BCOGC_data/ogc_formation_codes.csv",
sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(formation_codes, 3)
## Formation.Code Formation.Name
## 1 4610 A MARKER/BASE OF LIME
## 2 2703 AITKEN CREEK
## 3 9922 ALDRIDGE
formation_codes <- rename(formation_codes, Formtn_code = Formation.Code, Formtn_name = Formation.Name)
head(area_codes,3)
## Area_code Area_name
## 1 50 ADSETT
## 2 100 AIRPORT
## 3 200 AITKEN CREEK
head(formation_codes,3)
## Formtn_code Formtn_name
## 1 4610 A MARKER/BASE OF LIME
## 2 2703 AITKEN CREEK
## 3 9922 ALDRIDGE
Merge dataframes to create 2 additional columns with area and name as the wells file contains only its numerical codes.
wells <- merge(wells, area_codes, by = 'Area_code')
wells <- merge(wells, formation_codes, by = 'Formtn_code')
# we don't need them anymore in memory and dataframe
rm(area_codes, formation_codes) # remove form memory
wells <- subset(wells, select = -c(Area_code, Formtn_code)) # remove from dataframe
First glimpse on the data. Notice newly created columns, Area_name and Formtn_name at the end.
names(wells)
## [1] "Wa_num" "Prod_period" "Prod_days"
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3" "Area_name" "Formtn_name"
str(wells)
## 'data.frame': 517518 obs. of 9 variables:
## $ Wa_num : int 14893 25370 25370 26962 14893 14893 25371 25371 25371 26962 ...
## $ Prod_period : int 201809 202003 201901 201709 201805 202001 202011 201912 201805 201907 ...
## $ Prod_days : num 30 30.7 30.8 23 31 31 28.5 30.4 30.4 28 ...
## $ Gas_prod_vol_e3m3: num 0 0 0 0 0 0 0 0 0 0 ...
## $ Oil_prod_vol_m3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Water_prod_vol_m3: num 91.4 1291 2528 1068 6948.7 ...
## $ Cond_prod_vol_m3 : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Area_name : chr "DESAN" "PEEJAY WEST" "PEEJAY WEST" "BEAVERTAIL" ...
## $ Formtn_name : chr "QUATERNARY" "QUATERNARY" "QUATERNARY" "QUATERNARY" ...
summary(wells)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3
## Min. : 29 Min. :201601 Min. : 0.00 Min. : 0.0
## 1st Qu.:15183 1st Qu.:201704 1st Qu.:25.60 1st Qu.: 38.2
## Median :22328 Median :201807 Median :30.00 Median : 124.8
## Mean :21074 Mean :201810 Mean :26.08 Mean : 555.4
## 3rd Qu.:28257 3rd Qu.:201910 3rd Qu.:31.00 3rd Qu.: 608.5
## Max. :41055 Max. :202101 Max. :31.10 Max. :119042.3
## Oil_prod_vol_m3 Water_prod_vol_m3 Cond_prod_vol_m3 Area_name
## Min. : 0.00 Min. : 0.0 Min. : 0.00 Length:517518
## 1st Qu.: 0.00 1st Qu.: 0.9 1st Qu.: 0.00 Class :character
## Median : 0.00 Median : 5.0 Median : 0.00 Mode :character
## Mean : 10.77 Mean : 181.4 Mean : 19.31
## 3rd Qu.: 0.00 3rd Qu.: 52.6 3rd Qu.: 1.00
## Max. :7089.30 Max. :29177.7 Max. :7424.00
## Formtn_name
## Length:517518
## Class :character
## Mode :character
##
##
##
# Let's check number of formations in the file...
length(unique(wells$Formtn_name))
## [1] 92
# ...and list them
unique(wells$Formtn_name)
## [1] "QUATERNARY" "CRETACEOUS"
## [3] "CARDIUM SAND" "DOE CREEK"
## [5] "DUNVEGAN" "BASE OF FISH SCALES"
## [7] "SCATTER" "PADDY"
## [9] "PADDY- CADOTTE" "CADOTTE"
## [11] "SPIRIT RIVER" "NOTIKEWIN"
## [13] "FALHER" "FALHER A"
## [15] "FALHER B" "FALHER C"
## [17] "FALHER D" "WILRICH"
## [19] "BLUESKY" "BASAL BLUESKY"
## [21] "BLUESKY-GETHING" "BLUESKY-GETHING-DETRITAL"
## [23] "DETRITAL" "GETHING"
## [25] "LOWER GETHING" "BASAL GETHING"
## [27] "GETHING-BALDONNEL" "CADOMIN"
## [29] "CHINKEH" "NIKANASSIN"
## [31] "DUNLEVY" "LOWER DUNLEVY"
## [33] "ROCK CREEK" "NORDEGG"
## [35] "NORDEGG-BALDONNEL" "PARDONET-BALDONNEL"
## [37] "BALDONNEL" "BALDONNEL/UPPER CHARLIE LAKE"
## [39] "CHARLIE LAKE" "SIPHON"
## [41] "CECIL" "NANCY"
## [43] "FLATROCK" "BOUNDARY LAKE"
## [45] "YELLOW MARKER" "COPLIN"
## [47] "MICA" "BLUEBERRY"
## [49] "INGA" "NORTH PINE"
## [51] "BEAR FLAT" "PINGEL"
## [53] "LIMESTONE A BED" "A MARKER/BASE OF LIME"
## [55] "LOWER CHARLIE LAKE SANDS" "ARTEX"
## [57] "ARTEX/HALFWAY" "UPPER HALFWAY"
## [59] "HALFWAY" "LOWER HALFWAY"
## [61] "DOIG" "DOIG PHOSPHATE BEDS"
## [63] "BLUESKY-GETHING-MONTNEY" "LOWER CHARLIE LAKE/MONTNEY"
## [65] "DOIG PHOSPHATE-MONTNEY" "MONTNEY"
## [67] "BELLOY" "BELCOURT"
## [69] "BELCOURT-TAYLOR FLAT" "BELLOY-KISKATINAW"
## [71] "TAYLOR FLAT" "MISSISSIPPIAN"
## [73] "KISKATINAW" "LOWER KISKATINAW"
## [75] "BASAL KISKATINAW" "UPPER DEBOLT"
## [77] "DEBOLT" "ELKTON"
## [79] "SHUNDA" "PEKISKO"
## [81] "BANFF" "BESA RIVER"
## [83] "WABAMUN" "TETCHO"
## [85] "KAKISA" "JEAN MARIE"
## [87] "MUSKWA" "MUSKWA-OTTER PARK"
## [89] "SLAVE POINT" "SULPHUR POINT"
## [91] "EVIE" "PINE POINT"
slave_pnt <- filter(wells, Formtn_name == "SLAVE POINT" )
unique(slave_pnt$Formtn_name)
## [1] "SLAVE POINT"
muskwa <- filter(wells, Formtn_name %in% c("MUSKWA", "MUSKWA-OTTER PARK"))
unique(muskwa$Formtn_name)
## [1] "MUSKWA" "MUSKWA-OTTER PARK"
# same as
muskwa <- filter(wells, Formtn_name == "MUSKWA" | Formtn_name == "MUSKWA-OTTER PARK")
unique(muskwa$Formtn_name)
## [1] "MUSKWA" "MUSKWA-OTTER PARK"
Wa_num_sorted <- arrange(wells, Wa_num) # ascending
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 29 201703 30.0 73.0 0
## 2 29 201704 24.0 64.7 0
## 3 29 201706 3.0 7.2 0
## 4 29 201707 15.1 53.8 0
## 5 29 201611 22.0 166.2 0
## 6 29 201601 31.0 107.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 2.8 0.3 FORT ST JOHN BELLOY
## 2 1.9 0.3 FORT ST JOHN BELLOY
## 3 0.3 0.1 FORT ST JOHN BELLOY
## 4 2.0 0.5 FORT ST JOHN BELLOY
## 5 5.0 1.3 FORT ST JOHN BELLOY
## 6 2.3 0.5 FORT ST JOHN BELLOY
Wa_num_sorted <- arrange(wells, desc(Wa_num)) # descending
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 41055 202012 30.6 15051.0 0
## 2 41055 202101 30.8 12528.5 0
## 3 41055 202011 5.5 1293.3 0
## 4 41054 202011 5.6 1658.9 0
## 5 41054 202012 30.8 19143.3 0
## 6 41054 202101 30.8 17839.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 3630.5 950.8 HERITAGE MONTNEY
## 2 2087.3 770.4 HERITAGE MONTNEY
## 3 3396.8 110.8 HERITAGE MONTNEY
## 4 3444.7 317.9 HERITAGE MONTNEY
## 5 4204.7 1691.9 HERITAGE MONTNEY
## 6 2476.8 1170.2 HERITAGE MONTNEY
Wa_num_sorted <- arrange(wells, desc(Wa_num), Prod_period) # by multiple conditions
head(Wa_num_sorted)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 41055 202011 5.5 1293.3 0
## 2 41055 202012 30.6 15051.0 0
## 3 41055 202101 30.8 12528.5 0
## 4 41054 202011 5.6 1658.9 0
## 5 41054 202012 30.8 19143.3 0
## 6 41054 202101 30.8 17839.7 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name
## 1 3396.8 110.8 HERITAGE MONTNEY
## 2 3630.5 950.8 HERITAGE MONTNEY
## 3 2087.3 770.4 HERITAGE MONTNEY
## 4 3444.7 317.9 HERITAGE MONTNEY
## 5 4204.7 1691.9 HERITAGE MONTNEY
## 6 2476.8 1170.2 HERITAGE MONTNEY
rm(Wa_num_sorted)
names(wells)
## [1] "Wa_num" "Prod_period" "Prod_days"
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3" "Area_name" "Formtn_name"
head(select(wells, Area_name, Formtn_name))
## Area_name Formtn_name
## 1 DESAN QUATERNARY
## 2 PEEJAY WEST QUATERNARY
## 3 PEEJAY WEST QUATERNARY
## 4 BEAVERTAIL QUATERNARY
## 5 DESAN QUATERNARY
## 6 DESAN QUATERNARY
# instead of listing columns separately, we can list multiple columns next to each other using ":"
head(select(wells, Gas_prod_vol_e3m3:Cond_prod_vol_m3, Area_name, Formtn_name))
## Gas_prod_vol_e3m3 Oil_prod_vol_m3 Water_prod_vol_m3 Cond_prod_vol_m3
## 1 0 0 91.4 0
## 2 0 0 1291.0 0
## 3 0 0 2528.0 0
## 4 0 0 1068.0 0
## 5 0 0 6948.7 0
## 6 0 0 109.3 0
## Area_name Formtn_name
## 1 DESAN QUATERNARY
## 2 PEEJAY WEST QUATERNARY
## 3 PEEJAY WEST QUATERNARY
## 4 BEAVERTAIL QUATERNARY
## 5 DESAN QUATERNARY
## 6 DESAN QUATERNARY
The same wells have been listed multiple times as there are various production periods each year. Let’s create new column Prod_year to make more general summary.
head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
wells <- wells %>%
mutate(Prod_year = substr(Prod_period, 1, 4), # first 4 characters
Prod_month = substr(Prod_period, 5, 6), # 5th and 6th character
Prod_ym = paste(Prod_year, Prod_month, sep="-")) #to create "time points" for plotting
head(wells)
## Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1 14893 201809 30.0 0 0
## 2 25370 202003 30.7 0 0
## 3 25370 201901 30.8 0 0
## 4 26962 201709 23.0 0 0
## 5 14893 201805 31.0 0 0
## 6 14893 202001 31.0 0 0
## Water_prod_vol_m3 Cond_prod_vol_m3 Area_name Formtn_name Prod_year
## 1 91.4 0 DESAN QUATERNARY 2018
## 2 1291.0 0 PEEJAY WEST QUATERNARY 2020
## 3 2528.0 0 PEEJAY WEST QUATERNARY 2019
## 4 1068.0 0 BEAVERTAIL QUATERNARY 2017
## 5 6948.7 0 DESAN QUATERNARY 2018
## 6 109.3 0 DESAN QUATERNARY 2020
## Prod_month Prod_ym
## 1 09 2018-09
## 2 03 2020-03
## 3 01 2019-01
## 4 09 2017-09
## 5 05 2018-05
## 6 01 2020-01
class(wells$Prod_ym)
## [1] "character"
head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
prod_days_by_area <- wells %>%
group_by(Area_name) %>%
summarize(
total_prod_days = sum(Prod_days)
)
head(prod_days_by_area)
## # A tibble: 6 x 2
## Area_name total_prod_days
## <chr> <dbl>
## 1 ADSETT 6385.
## 2 AIRPORT 141.
## 3 AITKEN CREEK 4164.
## 4 AITKEN CREEK NORTH 1885.
## 5 ALTARES 14861.
## 6 BEAVERDAM 6439.
prod_days_by_area %>% filter(total_prod_days > 500000) %>%
ggplot(aes(x = Area_name, y = total_prod_days)) +
geom_col() +
ggtitle("Total Production Days for most productive areas", ) +
ylab("Production days") +
xlab("Area")
Histograms are commonly used to visualize the distribution of our data. The bars of histogram correspond to the number of observations within specific bin. Note: it’s recommended to check different bin sizes to find the most optimal bin for the data.
wells %>% filter(Cond_prod_vol_m3 > 0) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# let's limit our data to the wells which had production (100-1000). Here, we use default number of bins (30)
wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# increase number of bins to 200 to see the difference
wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>%
ggplot() +
geom_histogram(mapping = aes(Cond_prod_vol_m3), bins=200) +
ggtitle("Distribution of condensate production per production period") +
xlab(expression("Condensate production per production period m"^3))
Tables are a nice way to present numerical data and identify the trends. Let’s analyze the water production in different formations.
# Using tables we can see how many wells we have for each formation
wells %>% group_by(Formtn_name) %>%
summarize(
count_wells = n_distinct(Wa_num),#count unique well numbers (multiple prod. periods)
mean_wtr_prod = mean(Water_prod_vol_m3)
) %>% arrange(desc(count_wells))
## # A tibble: 92 x 3
## Formtn_name count_wells mean_wtr_prod
## <chr> <int> <dbl>
## 1 MONTNEY 4745 150.
## 2 JEAN MARIE 1531 3.21
## 3 HALFWAY 659 62.2
## 4 BLUESKY 589 2038.
## 5 CADOMIN 535 10.7
## 6 NOTIKEWIN 406 5.03
## 7 BALDONNEL 356 62.3
## 8 BLUESKY-GETHING-MONTNEY 354 1.34
## 9 GETHING 330 2.33
## 10 BOUNDARY LAKE 278 483.
## # ... with 82 more rows
# we can find which areas / formations are suspected to have more water disposal wells than HC production wells
wells %>% group_by(Area_name) %>%
summarize(
count_wells = n_distinct(Wa_num),
mean_wtr_prod = mean(Water_prod_vol_m3)
) %>% arrange(desc(mean_wtr_prod))
## # A tibble: 169 x 3
## Area_name count_wells mean_wtr_prod
## <chr> <int> <dbl>
## 1 HAY RIVER 211 4481.
## 2 KLUA 1 4328.
## 3 SUNRISE 14 2990.
## 4 TOWER LAKE 7 2458.
## 5 CLARKE LAKE 38 2145.
## 6 LAPP 9 1699.
## 7 GOPHER 1 1527.
## 8 WOODRUSH 8 867.
## 9 TWO RIVERS 9 708.
## 10 LIARD 6 700.
## # ... with 159 more rows
Scatterplots are useful when we want to show the relationship between 2 variables.
We can convert our dataframe to the long format to have all types of hydrocarbon production in one column using gather function. Picture below visualizes this:
# let's look again on the selected columns of original dataframe
wells %>% select(Water_prod_vol_m3, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3) %>%
head(3)
## Water_prod_vol_m3 Gas_prod_vol_e3m3 Oil_prod_vol_m3 Cond_prod_vol_m3
## 1 91.4 0 0 0
## 2 1291.0 0 0 0
## 3 2528.0 0 0 0
# we will use gather function to create new column "Prod_type" which will contain the "Prod_volume" value for gas, oil and condensate production. We assign the results to new dataframe.
prod_df <- wells %>%
gather(key = "Prod_type",
value = "Prod_volume",
c(Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3)) %>%
select(Water_prod_vol_m3, Prod_volume, Prod_year, Prod_ym, Prod_type, Prod_days, Formtn_name)
prod_df %>% select(Water_prod_vol_m3, Prod_type) %>% head(3)
## Water_prod_vol_m3 Prod_type
## 1 91.4 Gas_prod_vol_e3m3
## 2 1291.0 Gas_prod_vol_e3m3
## 3 2528.0 Gas_prod_vol_e3m3
# indeed, we have production type in Prod_type column!
unique(prod_df$Prod_type)
## [1] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3" "Cond_prod_vol_m3"
# let's visualize that quickly
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point()
# add some axis limits
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point() +
xlim(0,10000) +
ylim(0,25000)
## Warning: Removed 45 rows containing missing values (geom_point).
# visualize each type of production separately
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
geom_point() +
facet_wrap(~Prod_type)
# create indepentent y axis for each chart, since the production is in different units.
prod_df %>%
filter(Prod_year == 2021) %>%
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
geom_point() +
facet_wrap(~Prod_type, scales = "free") # scales free => not fixed limits of yaxis
# I expect that Prod_volume = 0 correspond to Saltwater Disposal wells, so let's remove those from the plot.
prod_df %>%
filter(Prod_year == 2021, Prod_volume > 0) %>% # Prod_colume > 0 to remove disposal wells
ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
geom_point() +
ggtitle("Water production per HC type") +
facet_wrap(~Prod_type, scales = "free") +
xlim(0,5000)
## Warning: Removed 126 rows containing missing values (geom_point).
Bar charts can be created in using two geometries: geom_col - by default height of the bars represent value in the data geom_bar - by default counts number of cases in each group
### Let's start with geom_col to visualize the production per period...
wells %>% subset(Prod_year > 2000) %>%
group_by(Prod_ym) %>%
summarise(
prod_per_period = sum(Cond_prod_vol_m3)
) %>%
ggplot(aes(x = Prod_ym, y = prod_per_period)) +
theme(axis.text.x = element_text(angle = 90)) +
geom_col()
### ...and per year. Which one is more useful / interesting?
wells %>%
subset(Prod_year > 2000) %>%
group_by(Prod_year) %>%
summarise(
prod_per_year = sum(Cond_prod_vol_m3)
) %>%
ggplot(aes(x = Prod_year, y = prod_per_year)) +
geom_col()
We can apply simple trick to order the formations from the highest production to the lowest (using mutate function).
# first let's create dataframe with summed produced values by formation
prod_by_formation <- wells %>%
group_by(Formtn_name) %>%
summarise(
oil_prod_total = sum(Oil_prod_vol_m3),
gas_prod_total = sum(Gas_prod_vol_e3m3),
cond_prod_total = sum(Cond_prod_vol_m3),
water_prod_total = sum(Water_prod_vol_m3)
) %>%
ungroup()
# find 5 most productive gas formation
most_gas <- prod_by_formation %>%
arrange(desc(gas_prod_total)) %>%
head(5)
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill='#ff6347') # change color to red
# we can use a trick to order the bars according to production value (using mutate function)
most_gas <- most_gas %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill='#ff6347') # change color to red
# same for oil production
most_oil <- prod_by_formation %>%
arrange(desc(oil_prod_total)) %>%
head(5) %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_oil, aes(x = Formtn_name, y = oil_prod_total)) +
geom_col()
# and for condensate production
most_cond <- prod_by_formation %>%
arrange(desc(cond_prod_total)) %>%
head(5) %>%
mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))
ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
geom_col(fill="#6495ed")
We can also visualize the same set of formations. To define the set, we will take 5 most productive gas, oil and condensate formations.
# define 5 most productive formations
best_formations <- unique(most_oil$Formtn_name,
most_gas$Formtn_name,
most_cond$Formtn_name)
# plot oil production
wells %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = Oil_prod_vol_m3, fill=Formtn_name)) +
geom_col() +
ggtitle("Total oil production by formation") +
xlab("Formation") +
ylab("Oil production [m3]")
# plot gas production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = gas_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total gas production by formation") +
xlab("Formation") +
ylab("Gas production [e3m3]")
# plot condensate production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = cond_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total condensate production by formation") +
xlab("Formation") +
ylab("Condensate production [m3]")
# plot water production
prod_by_formation %>%
subset(Formtn_name %in% best_formations) %>%
ggplot(aes(x = Formtn_name, y = water_prod_total, fill=Formtn_name)) +
geom_col() +
ggtitle("Total water production by formation") +
xlab("Formation") +
ylab("Water production [m3]")
Line graphs are useful to visualize the change in time. We can visualize the trends in production for example.
# reminder: available formation names in our dataframe
unique(wells$Formtn_name)[1:20]
## [1] "QUATERNARY" "CRETACEOUS" "CARDIUM SAND"
## [4] "DOE CREEK" "DUNVEGAN" "BASE OF FISH SCALES"
## [7] "SCATTER" "PADDY" "PADDY- CADOTTE"
## [10] "CADOTTE" "SPIRIT RIVER" "NOTIKEWIN"
## [13] "FALHER" "FALHER A" "FALHER B"
## [16] "FALHER C" "FALHER D" "WILRICH"
## [19] "BLUESKY" "BASAL BLUESKY"
# let's visualize only one formation first
formtnName = "BLUESKY"
prod_df %>%
filter(Formtn_name == formtnName) %>%
group_by(Prod_ym, Formtn_name, Prod_type) %>%
summarise(
total_prod = sum(Prod_volume)
) %>%
ungroup() %>%
ggplot(aes(x = Prod_ym, y = total_prod, group=Formtn_name, col=Prod_type)) +
geom_line() +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free") +
ggtitle(paste("Production per year for formation", formtnName))
## `summarise()` has grouped output by 'Prod_ym', 'Formtn_name'. You can override using the `.groups` argument.
# and now production from all formations in one chart
prod_df %>%
group_by(Prod_ym, Prod_type) %>%
summarise(
total_prod = sum(Prod_volume)
) %>%
ungroup() %>%
ggplot(aes(x = Prod_ym, y = total_prod, group=1, col=Prod_type)) +
geom_line() +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free") +
ggtitle("Total production per year")
## `summarise()` has grouped output by 'Prod_ym'. You can override using the `.groups` argument.
let’s see how facets can increase the visibility of our charts (and see the example of geom_bar as well)
#library(tidyverse)
# here we plot the count of wells per year for all formations
ggplot(data = wells, aes(x = Prod_year)) +
geom_bar()
# filter wells for most productive formations and most recent wells. Add some colors (fill) selected manually (scale_fill_manual)
ggplot(filter(wells, Formtn_name %in% best_formations & Prod_year < 2021)) +
geom_bar(aes(x = Formtn_name, fill=Formtn_name)) +
scale_fill_manual(values = c('#d0d1e6','#a6bddb','#67a9cf','#1c9099','#016c59')) + # taken e.g. from colorbrewer
facet_wrap(~ Prod_year) +
theme(axis.text.x = element_blank(), # remove ticks #cosmetics
axis.ticks.x = element_blank(),
axis.title.x = element_blank())
Making interactive plots in R is extremely easy. First, we need to create standard “static” chart and then feed it to plotly.
library(plotly)
head(prod_df)
## Water_prod_vol_m3 Prod_volume Prod_year Prod_ym Prod_type Prod_days
## 1 91.4 0 2018 2018-09 Gas_prod_vol_e3m3 30.0
## 2 1291.0 0 2020 2020-03 Gas_prod_vol_e3m3 30.7
## 3 2528.0 0 2019 2019-01 Gas_prod_vol_e3m3 30.8
## 4 1068.0 0 2017 2017-09 Gas_prod_vol_e3m3 23.0
## 5 6948.7 0 2018 2018-05 Gas_prod_vol_e3m3 31.0
## 6 109.3 0 2020 2020-01 Gas_prod_vol_e3m3 31.0
## Formtn_name
## 1 QUATERNARY
## 2 QUATERNARY
## 3 QUATERNARY
## 4 QUATERNARY
## 5 QUATERNARY
## 6 QUATERNARY
# let's filter the data first and summarize the production
prod_per_period <- prod_df %>%
filter(Formtn_name %in% best_formations) %>%
group_by(Prod_ym, Prod_type, Formtn_name) %>%
summarise(
prod_per_period = sum(Prod_volume)
)
prod_per_period %>% arrange(Prod_type, Formtn_name)
## # A tibble: 915 x 4
## # Groups: Prod_ym, Prod_type [183]
## Prod_ym Prod_type Formtn_name prod_per_period
## <chr> <chr> <chr> <dbl>
## 1 2016-01 Cond_prod_vol_m3 BALDONNEL 786.
## 2 2016-02 Cond_prod_vol_m3 BALDONNEL 631.
## 3 2016-03 Cond_prod_vol_m3 BALDONNEL 457.
## 4 2016-04 Cond_prod_vol_m3 BALDONNEL 442.
## 5 2016-05 Cond_prod_vol_m3 BALDONNEL 534.
## 6 2016-06 Cond_prod_vol_m3 BALDONNEL 423
## 7 2016-07 Cond_prod_vol_m3 BALDONNEL 368.
## 8 2016-08 Cond_prod_vol_m3 BALDONNEL 405.
## 9 2016-09 Cond_prod_vol_m3 BALDONNEL 460.
## 10 2016-10 Cond_prod_vol_m3 BALDONNEL 385.
## # ... with 905 more rows
# let's plot it according to the production type
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Formtn_name, fill=Formtn_name, text = paste("Formation:", Formtn_name, "<br>Production vol:", prod_per_period))) +
geom_area(alpha=0.3) +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01", "2019-01","2020-01", "2021-01")) +
facet_grid(Prod_type ~ ., scales = "free")
p <- ggplotly(p, tooltip = "text")
p
# That's nice but let's add some details, like text when we hover over charts.
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Prod_type,
text = paste("Formation:", Formtn_name,
"<br>Prod. type:", Prod_type), fill=Prod_type)) +
geom_area(colour="#636363", alpha=0.3) +
scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
"2019-01","2020-01", "2021-01")) +
facet_wrap(Formtn_name ~ ., scales = "free") +
ylab("") +
xlab("")
p <- ggplotly(p, tooltip = "text")
p
Let’s move to another dataset provided by BCOGC to access the location of the wells and plot them together with geophysical lines, O&G pools and fields. O&G field => group of O&G pools
Here we will use geom_sf geometry and sf package, commonly used for geospatial data manipulation. We fill use following sf functions: st_as_sf() - convert foreign object to the sf object st_crs() - get coordinate system of the feature st_transform() - transform features (for example change coordinate system) st_union() - merge multiple features into one st_intersects() - select intersecting features st_convex_hull() - create polygon around the points (minimum bounding area) st_within() - get features within other feature st_crop() - crop the feature
rm(list = ls(all=TRUE)) # clear global environment
sprintf("Current directory: %s", getwd())
## [1] "Current directory: C:/Users/Paulina-laptop/Desktop/R_programming/RLadies_workshop"
library(sf) # install.packages(sf) if you don't have it
# load dplyr and ggplot2 in case this is the only chunk you want to run
library(dplyr)
library(ggplot2, warn.conflicts = FALSE)
wells_coords <- read.csv(unzip("BCOGC_data/drill_csv.zip", "wells.csv"), skip = 1)
head(wells_coords)#bigger file
## Well.Surf.Loc Well.Name
## 1 D- 080-H/092-G-03 ROYAL CITY NO. 1 D- 080-H/092-G-03
## 2 B- 050-D/082-G-01 CANADIAN KOOTENAY NO. 1 B- 050-D/082-G-01
## 3 D- 055-A/082-G-02 BORDER OILS NO. 1 D- 055-A/082-G-02
## 4 13-11-081-17 CANLIN PINGEL 13-11-081-17
## 5 A- 065-E/093-I-16 CVE ENERGY ET AL LONE MOUNTAIN NO. 1 A- 065-E/093-I-16
## 6 C- 048-D/103-G-05 STRATHCONA QUEEN CHARLOTTE NO.1 C- 048-D/103-G-05
## WA.Num Surf.Nad27.Lat Surf.Nad27.Long Surf.Nad83.Lat Surf.Nad83.Long
## 1 1 49084860 123064913 49085314 123065320
## 2 2 49020730 114294872 49035335 114497851
## 3 3 49025250 114331128 49047892 114554121
## 4 4 NA NA 56004511 120331605
## 5 5 54530889 120254600 54530863 120255126
## 6 6 53172250 131590333 53171412 131575692
## Surf.UTM.Zone.Num Surf.UTM83.Northng Surf.UTM83.Eastng Surf.North Surf.East
## 1 10 5443925 491629.8 -192.0 -71.3
## 2 11 5434400 682875.8 NA NA
## 3 11 5435662 678718.6 NA NA
## 4 10 6210173 652456.4 -201.2 221.3
## 5 10 6085099 664789.3 274.9 -285.3
## 6 9 5908329 302312.1 -463.9 -107.9
## Surf.Owner Ground.Elevtn Directional.Flag Surf.Ref.Corner Surf.Ref.Unit
## 1 CROWN 1.5 N NE 80
## 2 CROWN NA N NW 50
## 3 CROWN NA N SW 55
## 4 CROWN 758.6 N NW NA
## 5 CROWN 971.6 N SE 65
## 6 CROWN 8.3 N NE 48
## Surf.Ref.Block Surf.Ref.Map Surf.Ref.Lsd Surf.Ref.Sect Surf.Ref.Twp
## 1 H 092-G-03 NA NA NA
## 2 D 082-G-01 NA NA NA
## 3 A 082-G-02 NA NA NA
## 4 13 11 81
## 5 E 093-I-16 NA NA NA
## 6 D 103-G-05 NA NA NA
## Surf.Ref.Range Surf.DLS.Exception Surf.Lsd Surf.Sect Surf.Twp Surf.Range
## 1 NA NA NA NA NA
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
## 4 17 13 11 81 17
## 5 NA NA NA NA NA
## 6 NA NA NA NA NA
## Surf.Qtr.Unit Surf.NTS.Exception Surf.Unit Surf.Block Surf.Map Oper.Id
## 1 D 80 H 092-G-03 0
## 2 B 50 D 082-G-01 0
## 3 D 55 A 082-G-02 0
## 4 NA 1018
## 5 A 65 E 093-I-16 1016
## 6 C 48 D 103-G-05 11075
## Oper.Abbrev Oper.Abbrev2 Optnl.Unit Well.Area.Name Well.Name.Date
## 1 ROYAL CITY NO. 1 19480610
## 2 CANADIAN KOOTENAY NO. 1 19490903
## 3 BORDER OILS NO. 1 19480704
## 4 CANLIN PINGEL 20171206
## 5 CVE ENERGY ET AL LONE MOUNTAIN NO. 1 20180607
## 6 STRATHCONA QUEEN CHARLOTTE NO.1 20200901
## Special.Well.Class.Code Test.Hole
## 1 NO N
## 2 NO N
## 3 N
## 4 NO N
## 5 NO N
## 6 NO N
names(wells_coords)
## [1] "Well.Surf.Loc" "Well.Name"
## [3] "WA.Num" "Surf.Nad27.Lat"
## [5] "Surf.Nad27.Long" "Surf.Nad83.Lat"
## [7] "Surf.Nad83.Long" "Surf.UTM.Zone.Num"
## [9] "Surf.UTM83.Northng" "Surf.UTM83.Eastng"
## [11] "Surf.North" "Surf.East"
## [13] "Surf.Owner" "Ground.Elevtn"
## [15] "Directional.Flag" "Surf.Ref.Corner"
## [17] "Surf.Ref.Unit" "Surf.Ref.Block"
## [19] "Surf.Ref.Map" "Surf.Ref.Lsd"
## [21] "Surf.Ref.Sect" "Surf.Ref.Twp"
## [23] "Surf.Ref.Range" "Surf.DLS.Exception"
## [25] "Surf.Lsd" "Surf.Sect"
## [27] "Surf.Twp" "Surf.Range"
## [29] "Surf.Qtr.Unit" "Surf.NTS.Exception"
## [31] "Surf.Unit" "Surf.Block"
## [33] "Surf.Map" "Oper.Id"
## [35] "Oper.Abbrev" "Oper.Abbrev2"
## [37] "Optnl.Unit" "Well.Area.Name"
## [39] "Well.Name.Date" "Special.Well.Class.Code"
## [41] "Test.Hole"
# subset dataframe
wells_coords <- wells_coords %>% select(Surf.UTM83.Northng, Surf.UTM83.Eastng, Surf.UTM.Zone.Num, Well.Area.Name, Oper.Abbrev, Surf.Owner)
# select only wells from the UNT10 and UTM11 zones
wells_coords <- wells_coords %>% filter(complete.cases(.), Surf.UTM.Zone.Num %in% c(10,11))
unique(wells_coords$Surf.UTM.Zone.Num)
## [1] 10 11
# create 2 geodataframes with crs values according to UTM zone
wells_pts_1 <- wells_coords %>%
filter(Surf.UTM.Zone.Num == 10) %>%
st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 26910) %>%
st_transform(4326) # want to use latlon coords
wells_pts_2 <- wells_coords %>%
filter(Surf.UTM.Zone.Num == 11) %>%
st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 26910) %>%
st_transform(4326) # want to use latlon coords
# bind dataframe rows vertically and check the coordinate system
wells_pts <- rbind(wells_pts_1, wells_pts_2)
st_crs(wells_pts)
## Coordinate Reference System:
## User input: EPSG:4326
## wkt:
## GEOGCRS["WGS 84",
## DATUM["World Geodetic System 1984",
## ELLIPSOID["WGS 84",6378137,298.257223563,
## LENGTHUNIT["metre",1]]],
## PRIMEM["Greenwich",0,
## ANGLEUNIT["degree",0.0174532925199433]],
## CS[ellipsoidal,2],
## AXIS["geodetic latitude (Lat)",north,
## ORDER[1],
## ANGLEUNIT["degree",0.0174532925199433]],
## AXIS["geodetic longitude (Lon)",east,
## ORDER[2],
## ANGLEUNIT["degree",0.0174532925199433]],
## USAGE[
## SCOPE["Horizontal component of 3D system."],
## AREA["World."],
## BBOX[-90,-180,90,180]],
## ID["EPSG",4326]]
# now let's load geoophysical lines
geoph_lines <- st_read("BCOGC_data/shapefiles/PASR_GEOPHYSICAL_LN_subset.shp")
## Reading layer `PASR_GEOPHYSICAL_LN_subset' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\PASR_GEOPHYSICAL_LN_subset.shp' using driver `ESRI Shapefile'
## Simple feature collection with 49710 features and 19 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: 1266576 ymin: 1173555 xmax: 1374678 ymax: 1265680
## Projected CRS: NAD83 / BC Albers
# we will focus only on the lines within Dawson Area
geoph_lines_dawson <- filter(geoph_lines, PROG_NAME %in% c("DAWSON 3D","DAWSON 3D EXTENSION"))
# let's set the same crs as the wells
geoph_lines_dawson <- st_transform(geoph_lines_dawson, st_crs(wells_pts))
# merge the lines and create the polygon to filter the wells
dawson_area <- st_union(geoph_lines_dawson) %>%
st_convex_hull()
## although coordinates are longitude/latitude, st_union assumes that they are planar
# filter the wells
wells_dawson <- wells_pts %>%
filter(st_within(x = ., y = dawson_area, sparse = FALSE))
## although coordinates are longitude/latitude, st_within assumes that they are planar
# plot together
ggplot() +
geom_sf(data = wells_dawson, size=1, alpha=0.2, colour='blue') +
geom_sf(data = geoph_lines_dawson, size=0.1)
# let's add O&G pools
pools <- st_read("BCOGC_data/shapefiles/POOL_CONTOURS_LN.shp")
## Reading layer `POOL_CONTOURS_LN' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\POOL_CONTOURS_LN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 7963 features and 12 fields
## Geometry type: MULTILINESTRING
## Dimension: XY
## Bounding box: xmin: 1091006 ymin: 1070402 xmax: 1386930 ymax: 1679933
## Projected CRS: NAD83 / BC Albers
# get the production pools intersecting the Dawson area only
pools <- st_transform(pools, st_crs(wells_dawson))
pools_dawson <- pools %>%
filter(st_intersects(x = ., y = dawson_area, sparse = FALSE))
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
# plot it
ggplot() +
geom_sf(data = wells_dawson, alpha=0.2) +
geom_sf(data = geoph_lines_dawson, alpha=0.2) +
geom_sf(data = pools_dawson, aes(colour=FLUID_TYPE)) +
scale_color_discrete(name = "Producing pools") +
xlab("Longitude") +
ylab("Latitude")
# Let's add one more information:O&G fields. Transform to the crs of the wells
fields <- st_read("BCOGC_data/shapefiles/FIELDS_PY.shp")
## Reading layer `FIELDS_PY' from data source `C:\Users\Paulina-laptop\Desktop\R_programming\RLadies_workshop\BCOGC_data\shapefiles\FIELDS_PY.shp' using driver `ESRI Shapefile'
## Simple feature collection with 570 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension: XY
## Bounding box: xmin: 1079392 ymin: 1066850 xmax: 1387498 ymax: 1680540
## Projected CRS: NAD83 / BC Albers
ggplot() +
geom_sf(data = fields)
fields <- st_transform(fields, st_crs(wells_dawson))
# let's focus on the fields in the smaller area (crop the layer)
fields <- st_crop(fields, xmin = -121, xmax = -120, ymin = 55.7, ymax = 56)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
unique(fields$FLDRNM)
## [1] "MICA" "PARKLAND" "DOE" "DAWSON CREEK"
## [5] "GROUNDBIRCH" "SATURN" "SUNSET PRAIRIE" "BRIAR RIDGE"
## [9] "BRASSEY" "TOWER LAKE" "SUNDOWN" "SUNRISE"
# let's visualize only Dawson field
field_dawson <- fields %>% filter(FLDRNM == "DAWSON CREEK")
# let's plot it together
ggplot() +
geom_sf(data = wells_dawson, alpha=0.2) +
geom_sf(data = geoph_lines_dawson, alpha=0.2) +
geom_sf(data = pools, aes(colour=FLUID_TYPE)) +
geom_sf(data = field_dawson, fill=NA, linetype="dashed", colour="blue", size=0.2) +
scale_color_discrete(name = "Producing pools") +
coord_sf(crs = 4326, xlim = c(-120.5,-120), ylim = c(55.7, 56)) +
xlab("Longitude") +
ylab("Latitude")